This script focuses on the similarity analysis between BE and other tissues: NE, NSCJ, BSCJ, NG and SMG.
library(scran)
library(scater)
library(DropletUtils)
library(openxlsx)
library(Rtsne)
library(pheatmap)
library(cowplot)
library(RColorBrewer)
library(edgeR)
library(ape)
library(viridis)
library(umap)
source("../Analysis/Functions/auxiliary.R")
sce <- readRDS("~/Dropbox/Postdoc/2019-12-29_BE2020/All_corrected_sce_filtered.rds")
# Exclude contaminating cells
sce <- sce[,sce$include]
# Exclude duodenum cells
sce <- sce[,colData(sce)$Tissue != "ND"]
Here, we visualize the batch-corrected counts using tsne.
# # Batch correction
# sce.list <- split.sce(sce, unique(sce$Sample), colData.name = "Sample")
#
# # Order sce objects for batch correction
# sce.list <- sce.list[order(unlist(lapply(sce.list, ncol)), decreasing = TRUE)]
#
# n <- names(sce.list)
# sce.list <- sce.list[c(which(grepl("NSCJ_out", n)), which(grepl("BSCJ_out", n)),
# which(grepl("NE_out", n)), which(grepl("NG_out", n)),
# which(grepl("BE_out", n)), which(grepl("SMG_out", n)))]
#
# corrected <- batch.correction(sce.list)
# sce <- do.call("cbind", sce.list)
# order the samples by size of the sample
n <- names(sort(table(sce[["Sample"]]), decreasing = TRUE))
n <- n[c(which(grepl("NSCJ_out", n)), which(grepl("BSCJ_out", n)),
which(grepl("NE_out", n)), which(grepl("NG_out", n)),
which(grepl("BE_out", n)), which(grepl("SMG_out", n)))]
sce<-batchelor::multiBatchNorm(sce, batch = sce[["Sample"]])
# The samples are in NSCJ, NE, NG, SMG order
corrected <- batch.correction.single(sce, batches = "Sample", m.order = n)
# Compute new tSNE
set.seed(1234)
tsne <- Rtsne(t(corrected), pca = FALSE, perplexity = 250)
umap <-umap(t(corrected))
reducedDims(sce)$TSNE <- tsne$Y
reducedDims(sce)$umap <- umap$layout
# Clustering on corrected data
g <- buildSNNGraph(corrected, k = 10)
clusters <- igraph::cluster_louvain(g)$membership
# Check what clusters overlap with nonepithelial cell types
tmp<-reshape2::dcast(plyr::count(cbind(clusters,sce$tissue_type)), formula = x.clusters ~ x.V2, fill = 0)
## Using freq as value column: use value.var to override.
tmp<-tmp$x.clusters[tmp$NonEpithelial == apply(as.matrix(tmp[,2:5]), 1, max)]
# View(tmp)
# Exclude immune and stromal clusters
ImmuneTF <- clusters
ImmuneTF <- ifelse(ImmuneTF %in% tmp, "nonepi", "epi")
# Visualize clustering with low alpha for immune and stromal cells
ggplot(data.frame(tSNE1 = tsne$Y[,1],
tSNE2 = tsne$Y[,2],
clusters = as.factor(clusters),
ImmuneTF =ImmuneTF)) +
geom_point(aes(tSNE1, tSNE2, colour = clusters, alpha = ImmuneTF), size = 0.5) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.05))
# Visualize clustering with low alpha for immune and stromal cells - umap
ggplot(data.frame(UMAP1 = umap$layout[,1],
UMAP2 = umap$layout[,2],
clusters = as.factor(clusters),
ImmuneTF =ImmuneTF)) +
geom_point(aes(UMAP1, UMAP2, colour = clusters, alpha = ImmuneTF), size = 0.5) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.01))
# Visualize tsne
p.all.cells <- ggplot(data.frame(tsne1 = reducedDims(sce)$TSNE[,1],
tsne2 = reducedDims(sce)$TSNE[,2],
tissue = colData(sce)$Tissue,
ImmuneTF =ImmuneTF)) +
geom_point(aes(tsne1, tsne2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(tsne1, tsne2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.05)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
p.all.cells
# Visualize tsne random order
jittered <- sample(ncol(sce))
p.all.cells_random <- ggplot(data.frame(tsne1 = reducedDims(sce)$TSNE[jittered,1],
tsne2 = reducedDims(sce)$TSNE[jittered,2],
tissue = colData(sce)$Tissue[jittered],
ImmuneTF =ImmuneTF[jittered])) +
geom_point(aes(tsne1, tsne2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(tsne1, tsne2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.05)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
p.all.cells_random
# Visualize umap
p.all.cells.umap <- ggplot(data.frame(UMAP1 = umap$layout[,1],
UMAP2 = umap$layout[,2],
tissue = colData(sce)$Tissue,
ImmuneTF =ImmuneTF)) +
geom_point(aes(UMAP1, UMAP2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(UMAP1, UMAP2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.01)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
p.all.cells.umap
# Visualize umap in random rder
p.all.cells.umap_random <- ggplot(data.frame(UMAP1 = umap$layout[jittered,1],
UMAP2 = umap$layout[jittered,2],
tissue = colData(sce)$Tissue[jittered],
ImmuneTF =ImmuneTF[jittered])) +
geom_point(aes(UMAP1, UMAP2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(UMAP1, UMAP2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "nonepi" = 0.01)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
p.all.cells.umap_random
#Identify the immune and stromal cells based on the manual annotation
ImmuneTF <- colData(sce)$cell_type
ImmuneTF[!(ImmuneTF %in% c("Immune", "Stromal"))] <- "epi"
# Visualize tsne
ggplot(data.frame(tsne1 = reducedDims(sce)$TSNE[,1],
tsne2 = reducedDims(sce)$TSNE[,2],
tissue = colData(sce)$Tissue,
ImmuneTF =ImmuneTF)) +
geom_point(aes(tsne1, tsne2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(tsne1, tsne2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "Immune" = 0.05, "Stromal" = 0.05)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
# Visualize umap
ggplot(data.frame(UMAP1 = umap$layout[,1],
UMAP2 = umap$layout[,2],
tissue = colData(sce)$Tissue,
ImmuneTF =ImmuneTF)) +
geom_point(aes(UMAP1, UMAP2, alpha = ImmuneTF), colour = "black", size = 1) +
geom_point(aes(UMAP1, UMAP2, colour = tissue, alpha = ImmuneTF), size = 0.5) +
scale_color_manual(values = metadata(sce)$colour_vector) +
scale_alpha_manual(values = c("epi" = 1 , "Immune" = 0.01, "Stromal" = 0.01)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.background = element_blank(),
legend.key = element_blank()
) +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
guides(colour = guide_legend(override.aes = list(size=4), title = "Tissue")
)
# visualise individual gene expression
gene<-"CHGA"
cur_exp <- ggplot(data.frame(tSNE1 = reducedDims(sce)$TSNE[,1],
tSNE2 = reducedDims(sce)$TSNE[,2],
clusters = as.factor(colData(sce)$Tissue_cluster),
expression = logcounts(sce)[rowData(sce)$Symbol == gene,])) +
geom_point(aes(tSNE1, tSNE2, colour = expression)) +
scale_colour_viridis(name = "log2(Expr)") + ggtitle(gene)
print(cur_exp)
gene<-"CHGA"
cur_exp <- ggplot(data.frame(UMAP1 = umap$layout[,1],
UMAP2 = umap$layout[,2],
clusters = as.factor(colData(sce)$Tissue_cluster),
expression = logcounts(sce)[rowData(sce)$Symbol == gene,])) +
geom_point(aes(UMAP1, UMAP2, colour = expression)) +
scale_colour_viridis(name = "log2(Expr)") + ggtitle(gene)
print(cur_exp)
gene<-"MUC2"
cur_exp <- ggplot(data.frame(tSNE1 = reducedDims(sce)$TSNE[,1],
tSNE2 = reducedDims(sce)$TSNE[,2],
clusters = as.factor(colData(sce)$Tissue_cluster),
expression = logcounts(sce)[rowData(sce)$Symbol == gene,])) +
geom_point(aes(tSNE1, tSNE2, colour = expression)) +
scale_colour_viridis(name = "log2(Expr)") + ggtitle(gene)
print(cur_exp)
gene<-"MUC2"
cur_exp <- ggplot(data.frame(UMAP1 = umap$layout[,1],
UMAP2 = umap$layout[,2],
clusters = as.factor(colData(sce)$Tissue_cluster),
expression = logcounts(sce)[rowData(sce)$Symbol == gene,])) +
geom_point(aes(UMAP1, UMAP2, colour = expression)) +
scale_colour_viridis(name = "log2(Expr)") + ggtitle(gene)
print(cur_exp)
ggsave("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/All_cells_noND.pdf", p.all.cells, width = 7, height = 5, useDingbats = FALSE)
ggsave("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/All_cells_noND_umap.pdf", p.all.cells.umap, width = 7, height = 5, useDingbats = FALSE)
ggsave("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/All_cells_noND_random.pdf", p.all.cells_random, width = 7, height = 5, useDingbats = FALSE)
ggsave("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/All_cells_noND_umap_random.pdf", p.all.cells.umap_random, width = 7, height = 5, useDingbats = FALSE)
To estimate the similarity between all cell types, we compute an averaged expression profile for each cell-type and perform hierarchical cluster analysis.
# We will perform this analysis on the corrected and uncorrected counts (uncorrected in supplements)
clusters <- paste(colData(sce)$Tissue, colData(sce)$Tissue_cluster, sep = "_")
mat <- matrix(data = NA, ncol = length(unique(clusters)), nrow = nrow(corrected))
colnames(mat) <- unique(clusters)
for(i in unique(clusters)){
mat[,i] <- rowMeans(corrected[,clusters == i])
}
# # Alternative calculation of average consensus
# for(i in unique(clusters)){
# tmp<-apply(corrected[,clusters == i], 1, function (x) {aggregate(x, by = list(colData(sce)$Sample[clusters ==i]), mean)})
# mat[,i] <- unlist(lapply(tmp, function(x){mean(x$x)}))
# }
# Rename the matrix to contain the actual cell-type labels
for(i in 1:ncol(mat)){
colnames(mat)[i] <- unique(sce$cell_type[sce$Tissue == sub("_[0-9]*$", "", colnames(mat)[i]) &
sce$Tissue_cluster == sub("^[A-Z2]*_", "", colnames(mat)[i])])
}
# Calculate euclidean distance on corrected counts
dend <- hclust(dist(t(mat), method = "euclidean"), method = "ward.D2")
plot(as.phylo(dend), type = "fan",
tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
dev.off()
## null device
## 1
pdf("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/Tree_corrected_counts_noND.pdf", width = 10, height = 10, useDingbats = FALSE)
plot(as.phylo(dend), type = "fan",
tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
dev.off()
## null device
## 1
To estimate the similarity between all cell types, we compute an averaged expression profile for each cell-type and perform hierarchical cluster analysis.
# We will perform this analysis on the corrected and uncorrected counts (uncorrected in supplements)
HVG.genes<-modelGeneVar(sce, block = sce[["Sample"]])
genes<- getTopHVGs(HVG.genes, n = 1000)
clusters <- paste(colData(sce)$Tissue, colData(sce)$Tissue_cluster, sep = "_")
mat <- matrix(data = NA, ncol = length(unique(clusters)), nrow = 1000)
colnames(mat) <- unique(clusters)
# for(i in unique(clusters)){
# mat[,i] <- rowMeans(corrected[,clusters == i])
# }
# Alternative calculation of average consensus
for(i in unique(clusters)){
tmp<-apply(logcounts(sce)[genes,clusters == i], 1, function (x) {aggregate(x, by = list(colData(sce)$Sample[clusters ==i]), mean)})
mat[,i] <- unlist(lapply(tmp, function(x){mean(x$x)}))
}
# Rename the matrix to contain the actual cell-type labels
for(i in 1:ncol(mat)){
colnames(mat)[i] <- unique(sce$cell_type[sce$Tissue == sub("_[0-9]*$", "", colnames(mat)[i]) &
sce$Tissue_cluster == sub("^[A-Z2]*_", "", colnames(mat)[i])])
}
# Calculate euclidean distance on corrected counts
# dend <- hclust(dist(t(mat), method = "euclidean"), method = "ward.D2")
dend <- hclust(as.dist(sqrt(1 - cor(mat, method = "spearman"))/2), method = "ward.D2")
plot(as.phylo(dend), type = "fan",
tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
pheatmap(mat, scale = "row", clustering_distance_cols = "correlation", clustering_method = "ward.D2")
# dev.off()
# pdf("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/Tree_corrected_counts_noND.pdf", width = 10, height = 10, useDingbats = FALSE)
# plot(as.phylo(dend), type = "fan",
# tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
# dev.off()
To estimate the similarity between all cell types, we compute an averaged expression profile for each cell-type and perform hierarchical cluster analysis.
# We will perform this analysis on the corrected and uncorrected counts (uncorrected in supplements)
clusters <- paste(colData(sce)$Tissue, colData(sce)$Tissue_cluster, sep = "_")
mat <- matrix(data = NA, ncol = length(unique(clusters)), nrow = nrow(corrected))
colnames(mat) <- unique(clusters)
for(i in unique(clusters)){
mat[,i] <- rowMeans(corrected[,clusters == i])
}
# # Alternative calculation of average consensus
# for(i in unique(clusters)){
# tmp<-apply(corrected[,clusters == i], 1, function (x) {aggregate(x, by = list(colData(sce)$Sample[clusters ==i]), mean)})
# mat[,i] <- unlist(lapply(tmp, function(x){mean(x$x)}))
# }
# Rename the matrix to contain the actual cell-type labels
for(i in 1:ncol(mat)){
colnames(mat)[i] <- unique(sce$cell_type_secondary[sce$Tissue == sub("_[0-9]*$", "", colnames(mat)[i]) &
sce$Tissue_cluster == sub("^[A-Z2]*_", "", colnames(mat)[i])])
}
# Calculate euclidean distance on corrected counts
dend <- hclust(dist(t(mat), method = "euclidean"), method = "ward.D2")
plot(as.phylo(dend), type = "fan",
tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
dev.off()
## null device
## 1
pdf("~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Figure_3/Tree_corrected_counts_noND_alt_names.pdf", width = 10, height = 10, useDingbats = FALSE)
plot(as.phylo(dend), type = "fan",
tip.color = metadata(sce)$colour_vector[sub("_[0-9]*$", "", unique(clusters))])
dev.off()
## null device
## 1
To finish get session info:
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 31 (Workstation Edition)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] destiny_3.0.1 dbscan_1.1-5
## [3] princurve_2.1.4 dynamicTreeCut_1.63-1
## [5] umap_0.2.4.1 viridis_0.5.1
## [7] viridisLite_0.3.0 ape_5.3
## [9] edgeR_3.28.0 limma_3.42.2
## [11] RColorBrewer_1.1-2 cowplot_1.0.0
## [13] pheatmap_1.0.12 Rtsne_0.15
## [15] openxlsx_4.1.4 DropletUtils_1.6.1
## [17] scater_1.14.6 ggplot2_3.2.1
## [19] scran_1.14.6 SingleCellExperiment_1.8.0
## [21] SummarizedExperiment_1.16.1 DelayedArray_0.12.2
## [23] BiocParallel_1.20.1 matrixStats_0.55.0
## [25] Biobase_2.46.0 GenomicRanges_1.38.0
## [27] GenomeInfoDb_1.22.0 IRanges_2.20.2
## [29] S4Vectors_0.24.3 BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 RcppEigen_0.3.3.7.0 plyr_1.8.5
## [4] igraph_1.2.4.2 lazyeval_0.2.2 sp_1.3-2
## [7] RcppHNSW_0.2.0 digest_0.6.24 htmltools_0.4.0
## [10] magrittr_1.5 R.utils_2.9.2 xts_0.12-0
## [13] askpass_1.1 colorspace_1.4-1 rappdirs_0.3.1
## [16] haven_2.2.0 xfun_0.12 dplyr_0.8.4
## [19] crayon_1.3.4 RCurl_1.98-1.1 jsonlite_1.6.1
## [22] hexbin_1.28.1 zoo_1.8-7 glue_1.3.1
## [25] gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
## [28] car_3.0-6 BiocSingular_1.2.1 Rhdf5lib_1.8.0
## [31] DEoptimR_1.0-8 HDF5Array_1.14.2 abind_1.4-5
## [34] VIM_5.1.0 scales_1.1.0 ggplot.multistats_1.0.0
## [37] ggthemes_4.2.0 Rcpp_1.0.3 laeken_0.5.1
## [40] reticulate_1.14 dqrng_0.2.1 foreign_0.8-72
## [43] rsvd_1.0.2 proxy_0.4-23 vcd_1.4-5
## [46] farver_2.0.3 pkgconfig_2.0.3 R.methodsS3_1.8.0
## [49] nnet_7.3-12 locfit_1.5-9.1 labeling_0.3
## [52] reshape2_1.4.3 tidyselect_1.0.0 rlang_0.4.4
## [55] munsell_0.5.0 cellranger_1.1.0 tools_3.6.2
## [58] ranger_0.12.1 batchelor_1.2.4 evaluate_0.14
## [61] stringr_1.4.0 yaml_2.2.1 knitr_1.28
## [64] zip_2.0.4 robustbase_0.93-5 purrr_0.3.3
## [67] nlme_3.1-142 formatR_1.7 R.oo_1.23.0
## [70] compiler_3.6.2 beeswarm_0.2.3 curl_4.3
## [73] e1071_1.7-3 smoother_1.1 tibble_2.1.3
## [76] statmod_1.4.33 stringi_1.4.5 RSpectra_0.16-0
## [79] forcats_0.4.0 lattice_0.20-38 Matrix_1.2-18
## [82] vctrs_0.2.2 pillar_1.4.3 lifecycle_0.1.0
## [85] lmtest_0.9-37 BiocNeighbors_1.4.1 data.table_1.12.8
## [88] bitops_1.0-6 irlba_2.3.3 R6_2.4.1
## [91] pcaMethods_1.78.0 gridExtra_2.3 rio_0.5.16
## [94] vipor_0.4.5 codetools_0.2-16 boot_1.3-23
## [97] MASS_7.3-51.4 assertthat_0.2.1 rhdf5_2.30.1
## [100] openssl_1.4.1 withr_2.1.2 GenomeInfoDbData_1.2.2
## [103] hms_0.5.3 grid_3.6.2 tidyr_1.0.2
## [106] class_7.3-15 rmarkdown_2.1 DelayedMatrixStats_1.8.0
## [109] carData_3.0-3 TTR_0.23-6 scatterplot3d_0.3-41
## [112] ggbeeswarm_0.6.0